Stav Suzan Rabinovich
!pip install --upgrade plotly
!pip install sweetviz
!pip install autoviz
!pip install tqdm
!pip install --upgrade scikit-learn
# Imports
import pandas as pd
import numpy as np
import seaborn as sns
import sweetviz as sw
import xgboost as xgb
import lightgbm as lgb
from scipy.stats import norm
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
# sklearn imports
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn import pipeline
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn import linear_model
from sklearn.linear_model import Lasso, Ridge, RidgeCV, ElasticNet
from sklearn.linear_model import SGDRegressor, LinearRegression
from sklearn import preprocessing
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, PolynomialFeatures
from sklearn import model_selection
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import neural_network
from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.feature_selection import RFE
from sklearn.compose import ColumnTransformer
# --- #
import sys
import warnings
# Silence all Python warnings unless the user explicitly passed -W on the
# command line (sys.warnoptions is non-empty in that case).
if not sys.warnoptions:
    warnings.simplefilter("ignore")
# Reading in the competition data (Kaggle input directory)
DATA_DIR = '../input/house-prices-advanced-regression-techniques'
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
Below are helper functions that I created for use throughout this notebook.
def fill_na_as_zero(df, columns_name):  # Fill empty columns with 0
    """Replace NaNs with 0, in place, in each of the given columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame, modified in place.
    columns_name : iterable of column labels whose NaNs become 0.
    """
    # Assign the filled column back instead of delegating to a helper that
    # calls fillna(inplace=True) on a column slice: chained in-place fills are
    # deprecated (pandas >= 2.1) and may not write through under copy-on-write.
    for column_name in columns_name:
        df[column_name] = df[column_name].fillna(0)
def fill_na_as_zero_col(df, column_name):
    """Replace NaNs with 0, in place, in a single column of ``df``."""
    # The original built an unused `df_not_null` frame (dead code) and used
    # df[col].fillna(0, inplace=True) — deprecated chained assignment that may
    # silently fail under pandas copy-on-write. Assign the result back instead.
    df[column_name] = df[column_name].fillna(0)
def create_pie_chart_of_count(df, column_name):  # Pie Chart
    """Show a pie chart of the value counts of ``column_name`` (nulls excluded)."""
    non_null = df.dropna(subset=[column_name])
    counts = non_null.groupby([column_name]).size().reset_index(name='count')
    fig = px.pie(counts, names=column_name, values='count')
    fig.show()
def create_pie_chart_with_nulls(df, column_name):  # Fill pie chart with the null values
    """Pie chart of ``column_name`` where nulls are first replaced with 0."""
    chart_df = df.copy()
    fill_na_as_zero_col(chart_df, column_name)
    create_pie_chart_of_count(chart_df, column_name)
def create_sunburst(df, column_name, columns_arr):  # Create sunburst charts of the features
    """Sunburst chart over the hierarchy ``columns_arr``, each row weighted 1."""
    plot_df = df.copy()
    # Append a constant-1 column so every row contributes equal weight.
    plot_df.insert(len(plot_df.columns), column_name, 1, True)
    fig = px.sunburst(plot_df, path=columns_arr, values=column_name)
    fig.update_layout(margin=dict(t=10, l=10, r=10, b=10))
    fig.show()
Ask a home buyer to describe their dream house, and they probably won't begin with the height of the basement ceiling or the proximity to an east-west railroad. But this playground competition's dataset proves that much more influences price negotiations than the number of bedrooms or a white-picket fence.
With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this competition challenges you to predict the final price of each home.
In this task I predicted the selling prices using data cleansing, linear regression models, and a Gradient Boosting Regressor.
As we can see, we have 80 features and 1460 samples in the train set and 1459 in the test set. I'll explore the data using the train data (df = train).
# Shape and schema overview of both datasets
print("\nThe train data size is: {} \n".format(train.shape))
train.info()
print("\nThe test data size is: {} \n".format(test.shape))
test.info()
To make sure that there is no missing data and that all features are used correctly, I'll perform data cleansing before exploring. (There are cases where NaN or NA is itself the answer, such as: no pool.)
At first glance we can find some NA and NaN values, but for some features the null itself represents a value (for example: no pool).
We can see a lot of missing values:
# Checking for missing data: NaN counts per column, train and test side by side
NAs = pd.concat([train.isnull().sum(), test.isnull().sum()], axis=1, keys=['Train', 'Test'])
# Show only the columns that actually contain missing values
NAs[NAs.sum(axis=1) > 0]
After using Dummy Encoding, the number of missing values is really small.
From now on, we will use train_df and test_df
# One-hot encode all categorical columns (pandas "dummy" encoding)
train_df = pd.get_dummies(train)
train_df.head()
test_df = pd.get_dummies(test)
test_df.head()
# Re-check missing values after encoding
NAs = pd.concat([train_df.isnull().sum(),test_df.isnull().sum()], axis=1, keys=['Train','Test'])
NAs[NAs.sum(axis=1) > 0]
All remaining missing values relate to house features that either exist or do not exist, so we can fill them with zeros (meaning the feature is absent).
# Train_df filling with zeros (these NaNs mean the feature is absent)
fill_na_as_zero(train_df,['LotFrontage','MasVnrArea','GarageYrBlt'])
# Test_df filling with zeros (the test set has missing values in more columns)
fill_na_as_zero(test_df,['LotFrontage','MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF',
                         'BsmtFullBath','BsmtHalfBath','GarageYrBlt','GarageCars','GarageArea'])
Now that there are no missing values, we may continue exploring the data:
# Verify: no missing values remain in either frame
train_df.info()
print()
test_df.info()
We can see from here that all of our features are numerical (and not categorical).
We can see that SalePrice is our target (an int64 value, and the column that is missing from the test set).
Let's see the target variable:
# Get the fitted parameters used by the function norm
# (mu, sigma of a normal distribution fit to the target — shown in the title)
mu, sigma = norm.fit(train_df['SalePrice'])
fig = px.histogram(data_frame=train_df, x='SalePrice')
fig.update_layout(
    title_text=f'SalePrice distribution - mu= {mu:.3f}, sigma= {sigma:.3f}',
    xaxis_title_text='SalePrice',
    yaxis_title_text='Count',
    bargap=0.2, # Gap between bars of adjacent location coordinates
)
fig.show()
From this histogram, we can conclude that most house prices are in the range of 80K to 400K.
# Automated EDA report (sweetviz); pairwise analysis disabled for speed
train_report = sw.analyze(train_df,pairwise_analysis='off')
train_report.show_notebook(layout='vertical')
We can see from these graphs which values are normally distributed and which have higher or lower correlation. Because we use a lot of features, we will also use other kinds of graphs.
traincpy_df = train_df.copy()  # Work on a copy so train_df stays intact
# Move SalePrice to the first column position (easier to eyeball)
t_col = traincpy_df.pop('SalePrice')
traincpy_df.insert(0, 'SalePrice', t_col)
traincpy_df.head()
# Correlation of every feature with the target, strongest first
plt.rcParams["figure.figsize"] = (7, 3)
cor = traincpy_df.corr()
temp = cor['SalePrice'].sort_values(ascending=False).reset_index()
temp = temp[temp["index"] != 'SalePrice']  # drop the target's self-correlation (1.0)
temp.columns = ["Variables", "Correlation"]
# Keep features whose |correlation| with SalePrice is at least 0.25
highly = temp[(temp["Correlation"] >= 0.25) | (temp["Correlation"] <= -0.25)]
plt.figure(figsize=(15, 10))
# seaborn >= 0.12 removed positional x/y arguments, so pass them by keyword
p = sns.barplot(x=highly.Correlation, y=highly["Variables"], palette="husl")
plt.suptitle("Target Correlation")
print(f'we have {len(highly)} features with high correlation')
Now we will remove all features whose correlation with the target is under |0.25|.
# Keep only the highly correlated features in both frames
cols_to_keep = highly['Variables'].tolist()
# NOTE(review): test_df comes from a separate pd.get_dummies call, so a dummy
# column present in train but absent in test would raise a KeyError here —
# verify the selected columns exist in both frames.
testcpy_df = test_df[cols_to_keep]
cols_to_keep.append('SalePrice') # The target
traincpy_df = train_df[cols_to_keep]
test_cols_to_keep = cols_to_keep.copy()
traincpy_df.info()
print(f'\n\nNow traincpy_df data contain {traincpy_df.shape[1]} features\n')
# We will also keep the test's same cols
testcpy_df.info()
print(f'\n\nNow testcpy_df data contain {testcpy_df.shape[1]} features\n')
def heat_map(df):
    """Display an absolute-valued correlation heatmap of df's columns."""
    plt.figure(figsize=(30, 30))
    corr_abs = df.corr().abs()
    sns.heatmap(corr_abs, annot=True, cmap=plt.cm.Blues, vmin=0, vmax=1)
    plt.show()
heat_map(traincpy_df)
# Split the cleaned frame into features (X) and target (t)
traincp_df = traincpy_df.drop(columns='SalePrice')  # copy without the target
X = traincp_df
t = train_df['SalePrice'].copy()
display(X)
print(f'X contain {X.shape[1]} features \n')
Let's start with our models.
# Candidate models; scale-sensitive linear models get a RobustScaler in front
hp_models = {
    'SGD Regressor': SGDRegressor(random_state=1),
    'LinearRegression': LinearRegression(),
    'Ridge': make_pipeline(RobustScaler(), Ridge()),
    'Lasso': make_pipeline(RobustScaler(), Lasso(random_state=1)),
    'Elastic Net': make_pipeline(RobustScaler(), ElasticNet(random_state=1, alpha=0.05)),
    'Gradient Boosting Regressor': GradientBoostingRegressor(
        n_estimators=100,
        learning_rate=0.05,
        max_depth=4,
        max_features='sqrt',
        min_samples_leaf=15,
        min_samples_split=10,
        loss='huber',
        random_state=5,
    ),
}
Now, we will write a method that will calculate CV for a model
def get_cv_score_and_loss(X, t, n_model, model, k=None, show_score_loss_graphs=False, use_pbar=True):
    """K-fold cross-validate `model` on (X, t), reporting score and RMSE per fold.

    Parameters
    ----------
    X : pandas.DataFrame of features (rows selected positionally via .iloc).
    t : pandas.Series target, aligned with X.
    n_model : display name of the model (used in figure titles / progress bar).
    model : sklearn-compatible estimator with fit / predict / score.
    k : number of KFold splits; must be provided, otherwise ValueError.
    show_score_loss_graphs : when True, display the two plotly figures.
    use_pbar : when True, show a tqdm progress bar over the folds.

    Returns
    -------
    (mean val score, mean val loss, mean train score, mean train loss,
     score figure, loss figure) — losses are RMSE, scores from model.score.
    """
    # One row per (fold, split) with that fold's score and RMSE loss
    scores_losses_df = pd.DataFrame(columns=['fold_id', 'split', 'score', 'loss'])
    if k is not None:
        cv = KFold(n_splits=k, shuffle=True, random_state=1)
    else:
        raise ValueError('Specify k in order for the cv to work')
    if use_pbar:
        pbar = tqdm(desc=f'Computing Model {n_model}', total=cv.get_n_splits(X))
    for i, (train_ids, val_ids) in enumerate(cv.split(X)):
        X_train = X.iloc[train_ids]
        X_val = X.iloc[val_ids]
        t_train = t.iloc[train_ids]
        t_val = t.iloc[val_ids]
        # Refit on this fold's training rows, then score both splits
        model.fit(X_train, t_train)
        y_train = model.predict(X_train)
        y_val = model.predict(X_val)
        # squared=False makes mean_squared_error return RMSE.
        # NOTE(review): `squared` is deprecated in scikit-learn 1.4 and removed
        # in 1.6 (use root_mean_squared_error) — confirm the installed version.
        scores_losses_df.loc[len(scores_losses_df)] = [i, 'train', model.score(X_train, t_train), mean_squared_error(t_train, y_train, squared=False)]
        scores_losses_df.loc[len(scores_losses_df)] = [i, 'val', model.score(X_val, t_val), mean_squared_error(t_val, y_val, squared=False)]
        if use_pbar:
            pbar.update()
    if use_pbar:
        pbar.close()
    # Average score/loss separately over the validation and training rows
    val_scores_losses_df = scores_losses_df[scores_losses_df['split']=='val']
    train_scores_losses_df = scores_losses_df[scores_losses_df['split']=='train']
    score_mean_val = val_scores_losses_df['score'].mean()
    loss_mean_val = val_scores_losses_df['loss'].mean()
    score_mean_train = train_scores_losses_df['score'].mean()
    loss_mean_train = train_scores_losses_df['loss'].mean()
    # Per-fold line charts (always built; shown only on request, also returned)
    fig_score = px.line(scores_losses_df, x='fold_id', y='score', color='split', title=f'Model name: {n_model}, Mean Val Score: {score_mean_val:.2f}, Mean Train Score: {score_mean_train:.2f}')
    fig_loss = px.line(scores_losses_df, x='fold_id', y='loss', color='split', title=f'Model name: {n_model}, Mean Val Loss: {loss_mean_val:.2f}, Mean Train Loss: {loss_mean_train:.2f}')
    if show_score_loss_graphs:
        fig_loss.show()
        fig_score.show()
    return score_mean_val, loss_mean_val, score_mean_train, loss_mean_train, fig_score, fig_loss
def get_models_score_and_loss(X, t, models):  # a DataFrame to save results
    """Cross-validate every model in ``models`` on (X, t) with 10-fold CV.

    Each model is wrapped in a pipeline that one-hot encodes categorical
    columns and standardizes numerical ones.

    Returns a DataFrame indexed by model name with mean val/train score and
    loss plus the per-model plotly figures.
    """
    # BUG FIX: index by the `models` actually passed in — the original indexed
    # the global hp_models, which breaks when a different dict is supplied.
    results = pd.DataFrame(columns=['mean val score', 'mean val loss', 'mean train score', 'mean train loss', 'fig_score', 'fig_loss'],
                           index=models.keys())
    pbar = tqdm(models.items(), total=len(models))
    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = X.select_dtypes(include=['object', 'bool']).columns
    all_cols = np.array(X.columns)
    for n_model, model in pbar:
        pbar.set_description(f'Calculating model {n_model}')
        # scikit-learn >= 1.2 renamed OneHotEncoder's `sparse` to
        # `sparse_output` (the old name was removed in 1.4; this notebook
        # pip-upgrades scikit-learn, so use the new keyword).
        preprocessor = ColumnTransformer([
            ("encoding", OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_cols),
            ("standard", StandardScaler(), numerical_cols)])
        pipe = make_pipeline(preprocessor, model)
        val_score, val_loss, train_score, train_loss, fig_score, fig_loss = get_cv_score_and_loss(
            X, t, n_model, pipe, k=10, show_score_loss_graphs=False, use_pbar=False)
        results.loc[n_model] = [val_score, val_loss, train_score, train_loss, fig_score, fig_loss]
    return results
# Evaluate every candidate model with 10-fold CV
res = get_models_score_and_loss(X, t, hp_models)
display(res.iloc[:, :-2]) # Display all results
# Display the graphs for the best one of val score
best_idx = np.argmax(res['mean val score'])
res.iloc[best_idx]['fig_score'].show()
res.iloc[best_idx]['fig_loss'].show()
We can see from this graph that Gradient Boosting Regressor gives the best result.
# The Series returned by .iloc carries the model's dict key as its .name
best_row = res.iloc[np.argmax(res['mean val score'])]
best_model = hp_models[best_row.name]
best_model_name = best_row.name
print(best_model_name)
print(best_model)
To make sure our features give the best results, I decided to use RFE from sklearn, which implements a Backward Feature Selection algorithm.
The algorithm removes features until we get the best results with the minimum number of features.
def bw_feature_selection(X, t, n_model, model):
    """Backward feature elimination: try RFE subsets of 10..all features,
    cross-validating each subset, and plot CV score vs. subset size."""
    res_df = pd.DataFrame(columns=['features_count', 'score_mean_val', 'loss_mean_val',
                                   'score_mean_train', 'loss_mean_train', 'best_features'])
    subset_sizes = range(10, len(X.columns) + 1)
    pbar = tqdm(subset_sizes, total=len(X.columns) + 1 - 10)
    for n_feats in pbar:
        pbar.set_description(f'Calculates for {n_feats} features')
        selector = RFE(model, n_features_to_select=n_feats).fit(X, t)
        score_mean_val, loss_mean_val, score_mean_train, loss_mean_train, fig_score, fig_loss = \
            get_cv_score_and_loss(X, t, n_model, selector, k=7, use_pbar=False)
        if score_mean_val < 0:
            # Score went negative — stop the search early
            break
        res_df.loc[len(res_df)] = [n_feats, score_mean_val, loss_mean_val,
                                   score_mean_train, loss_mean_train, selector.support_]
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=res_df['features_count'], y=res_df['score_mean_val']))
    fig.update_xaxes(title_text='Number of features selected')
    fig.update_yaxes(title_text='Cross validation score')
    fig.show()
    return res_df
# Run backward feature selection with the best CV model found above
results_scores = bw_feature_selection(X, t, best_model_name, best_model)
display(results_scores.iloc[:, :-1])  # hide the boolean feature-mask column
Now we will see which features and what score RFE found to be best.
# Row of the RFE run with the highest mean validation score
best_rfe_row = results_scores.iloc[np.argmax(results_scores['score_mean_val'])]
best_features = best_rfe_row['best_features']
num_of_best_features = best_rfe_row['features_count']
best_score = results_scores['score_mean_val'].max()
X_best_features = X.loc[:, best_features]
print(f'There are {num_of_best_features} features that gives {best_score} score value.')
print('So our data now looks like:')
display(X_best_features)
def learning_rate_selection(X, t):
    """Sweep GradientBoostingRegressor learning rates in [0.0001, 0.1]
    (step 0.0001), cross-validating each, and plot CV score vs. rate.

    Returns a DataFrame with one row per learning rate tried.
    """
    min_learning_rate = 0.0001
    max_learning_rate = 0.1
    delta = 0.0001
    results_df = pd.DataFrame(columns=['learning_rate', 'score_mean_val', 'loss_mean_val',
                                       'score_mean_train', 'loss_mean_train'])
    rates = np.arange(min_learning_rate, max_learning_rate + delta, delta)
    # total=len(rates): the original used max // min, which was off by one
    pbar = tqdm(rates, total=len(rates))
    for lr in pbar:
        pbar.set_description(f'Calculating for learning rate {lr}')
        # BUG FIX: the original hard-coded learning_rate=0.05 here, so every
        # iteration trained an identical model and the sweep was a no-op —
        # use the loop's rate instead. Also dropped min_impurity_split,
        # removed in scikit-learn >= 1.0 (this notebook upgrades sklearn).
        model = GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                                          init=None, learning_rate=lr, loss='huber',
                                          max_depth=4, max_features='sqrt', max_leaf_nodes=None,
                                          min_impurity_decrease=0.0,
                                          min_samples_leaf=15, min_samples_split=10,
                                          min_weight_fraction_leaf=0.0, n_estimators=100,
                                          n_iter_no_change=None,
                                          random_state=5, subsample=1.0, tol=0.0001,
                                          validation_fraction=0.1, verbose=0, warm_start=False)
        # Label corrected: this evaluates the gradient-boosting model, not SGD
        mean_val_score, loss_mean_val, score_mean_train, loss_mean_train, fig_score, fig_loss = \
            get_cv_score_and_loss(X, t, 'GradientBoostingRegressor', model, k=10, use_pbar=False)
        if mean_val_score < 0:
            break
        results_df.loc[len(results_df)] = [lr, mean_val_score, loss_mean_val, score_mean_train, loss_mean_train]
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=results_df['learning_rate'], y=results_df['score_mean_val']))
    fig.update_xaxes(title_text='Learning Rate')
    fig.update_yaxes(title_text='Cross validation score (no. of correct classifications)')
    fig.show()
    return results_df
# Sweep learning rates on the RFE-selected feature subset
results_learning_rate = learning_rate_selection(X_best_features, t)
display(results_learning_rate)
# Pick the learning rate with the highest mean validation score
best_learning_rate = results_learning_rate.iloc[np.argmax(results_learning_rate['score_mean_val'])]['learning_rate']
best_score_learning_rate = results_learning_rate['score_mean_val'].max()
print(f'The best learning rate is {best_learning_rate} and gained {best_score_learning_rate} of score.')
testcpy_df
# NOTE(review): final_model and pipe below are constructed and displayed but
# never fitted or used for prediction — the actual predictions come from
# best_model, which was last fitted inside the cross-validation loop.
# Confirm this is intentional.
final_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.05,
                                        max_depth=4, max_features='sqrt',
                                        min_samples_leaf=15, min_samples_split=10,
                                        loss='huber', random_state =5)
numerical_cols = testcpy_df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = testcpy_df.select_dtypes(include=['object', 'bool']).columns
all_cols = np.array(testcpy_df.columns)
preprocessor = ColumnTransformer([
    ("encoding", OrdinalEncoder(), categorical_cols),
    ("standard", StandardScaler(), numerical_cols)
])
# NOTE(review): the preprocessor is fit on the TEST data only, and the encoded
# frame loses its column names — best_model was trained on differently
# preprocessed train columns, so verify feature alignment and scaling match.
y_encoded = pd.DataFrame(preprocessor.fit_transform(testcpy_df, t))
pipe = make_pipeline(preprocessor, final_model)
set_config(display='diagram')  # render sklearn objects as HTML diagrams
display(pipe)
y_encoded
y = y_encoded
test_id = test['Id']
# Predict sale prices for the test set with the CV-fitted best model
y = best_model.predict(y)
# Build and save the Kaggle submission file
output = pd.DataFrame({'Id': test_id, 'SalePrice': y})
output.to_csv('Attempt2.csv', index=False)
output
See also: the sklearn.ensemble.GradientBoostingRegressor documentation, and more about the set_config method from the sklearn package.